The whole analysis will use a count dataset of glioblastoma transcriptomic samples. This dataset contains 5 samples from Recurrent Tumor and 5 from Primary Tumor.
install.packages("gplots")
library("gplots")
raw_counts <- read.csv('/home/hp/Documents/HackbioCancer/glioblastoma_count_file.csv', row.names=1)
mat <- as.matrix(raw_counts)
log_data_matrix <- log2(mat+1) #log transformation of matrix as data range is too broad
head(log_data_matrix)
## TCGA.19.4065.02A.11R.2005.01 TCGA.19.0957.02A.11R.2005.01
## ENSG00000272398 9.577429 12.144340
## ENSG00000135439 11.430453 13.033595
## ENSG00000130348 9.876517 9.733015
## ENSG00000198719 7.857981 10.307201
## ENSG00000169429 9.079485 9.002815
## ENSG00000171608 10.325305 9.493855
## TCGA.06.0152.02A.01R.2005.01 TCGA.14.1402.02A.01R.2005.01
## ENSG00000272398 9.417853 10.830515
## ENSG00000135439 11.432542 8.204571
## ENSG00000130348 10.288866 10.450180
## ENSG00000198719 9.675957 8.845490
## ENSG00000169429 9.357552 11.497852
## ENSG00000171608 10.727070 8.049849
## TCGA.14.0736.02A.01R.2005.01 TCGA.06.5410.01A.01R.1849.01
## ENSG00000272398 11.604553 8.154818
## ENSG00000135439 9.483816 9.679480
## ENSG00000130348 9.022368 9.941048
## ENSG00000198719 6.845490 8.257388
## ENSG00000169429 10.222795 15.353905
## ENSG00000171608 8.422065 11.159241
## TCGA.19.5960.01A.11R.1850.01 TCGA.14.0781.01B.01R.1849.01
## ENSG00000272398 18.906104 9.891784
## ENSG00000135439 16.349575 9.592457
## ENSG00000130348 15.855793 10.051209
## ENSG00000198719 17.335643 8.400879
## ENSG00000169429 7.087463 12.823566
## ENSG00000171608 8.693487 10.207014
## TCGA.02.2483.01A.01R.1849.01 TCGA.06.2570.01A.01R.1849.01
## ENSG00000272398 14.45128 13.28135
## ENSG00000135439 10.55555 14.38256
## ENSG00000130348 10.60177 10.74567
## ENSG00000198719 12.40008 11.99081
## ENSG00000169429 10.90087 11.86766
## ENSG00000171608 14.95233 10.21796
col_palette <- colorRampPalette(c('blue','yellow','red'))(n=300)
heatmap.2(x=log_data_matrix, col = col_palette, density.info = 'none')
col_palette <- colorRampPalette(c('blue','yellow'))(n=300)
heatmap.2(x=log_data_matrix, col = col_palette, density.info = 'none')
col_palette <- colorRampPalette(c('blue','yellow','red'))(n=300)
heatmap.2(x=log_data_matrix, col = col_palette, density.info = 'none', dendrogram = 'row', Rowv = TRUE, Colv = FALSE)
par(mar = c(10, 10, 10, 10))
col_palette <- colorRampPalette(c('blue','yellow','red'))(n=300)
heatmap.2(x=log_data_matrix, col = col_palette, density.info = 'none', dendrogram = 'column', Rowv = FALSE, Colv = TRUE, lmat = rbind(c(4, 3), c(2, 1)), # Layout matrix
lhei = c(1, 8), # Increase the height of the heatmap body
lwid = c(0.2, 1), # Adjust widths for the heatmap and key
cexCol = 0.7, srtCol = 20)
col_palette <- colorRampPalette(c('blue','yellow','red'))(n=300)
heatmap.2(x=log_data_matrix, col = col_palette, density.info = 'none', Rowv = TRUE, Colv = TRUE)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DESeq2")
library("DESeq2")
#Creating Sample Table
sample_table <- data.frame(
sampleName <- colnames(raw_counts),
condition = c("Recurret Tumor","Recurret Tumor","Recurret Tumor","Recurret Tumor","Recurret Tumor","Primary Tumor","Primary Tumor","Primary Tumor","Primary Tumor","Primary Tumor")
)
sample_table$condition <- as.factor(sample_table$condition)
head(sample_table)
## sampleName....colnames.raw_counts. condition
## 1 TCGA.19.4065.02A.11R.2005.01 Recurret Tumor
## 2 TCGA.19.0957.02A.11R.2005.01 Recurret Tumor
## 3 TCGA.06.0152.02A.01R.2005.01 Recurret Tumor
## 4 TCGA.14.1402.02A.01R.2005.01 Recurret Tumor
## 5 TCGA.14.0736.02A.01R.2005.01 Recurret Tumor
## 6 TCGA.06.5410.01A.01R.1849.01 Primary Tumor
dds <- DESeqDataSetFromMatrix(countData = raw_counts, colData = sample_table, design = ~ condition)
dds <- DESeq(dds)
results_table <- results(dds)
significant_genes <- subset(results_table, padj < 0.05)
significant_genes_up <- subset(results_table, padj < 0.05 & log2FoldChange > 1)
significant_genes_down <- subset(results_table, padj < 0.05 & log2FoldChange < -1)
background_genes <- data.frame(gene = rownames(results_table), log2FC = results_table$log2FoldChange, pval = results_table$pvalue)
DE_significantly_up_genes <- data.frame(gene = rownames(significant_genes_up), log2FC = significant_genes_up$log2FoldChange, pval = significant_genes_up$pvalue)
DE_significantly_down_genes <- data.frame(gene = rownames(significant_genes_down), log2FC = significant_genes_down$log2FoldChange, pval = significant_genes_down$pvalue)
head(DE_significantly_up_genes)
## gene log2FC pval
## 1 ENSG00000202111 4.474860 0.009668808
## 2 ENSG00000118231 3.852969 0.002850270
## 3 ENSG00000286404 3.941243 0.011999617
## 4 ENSG00000287872 3.580704 0.009825419
## 5 ENSG00000288525 5.078708 0.003843982
## 6 ENSG00000259518 5.086102 0.002023824
# Exporting into a CSV file
write.csv(background_genes, "background_genes.csv", row.names = FALSE)
write.csv(DE_significantly_up_genes, "DE_significantly_up_genes.csv", row.names = FALSE)
write.csv(DE_significantly_down_genes, "DE_significantly_down_genes.csv", row.names = FALSE)
Using the default parameters in ShinyGO 0.80, for 89 significantly differential expressed genes and a background set of 582 genes, we found the following top 10 enriched KEGG pathways.